import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): blanket-suppressing ALL warnings hides pandas chained-assignment
# and sklearn deprecation warnings; consider filtering specific categories instead.
warnings.filterwarnings('ignore')
# Reading the dataset (expects 'vehicle.csv' in the current working directory).
vehicle_df = pd.read_csv('vehicle.csv')
# First few entries of dataset
vehicle_df.head(10)
# Column dtypes and non-null counts - reveals which columns contain missing values.
vehicle_df.info()
# Attribute Information:
#
# ATTRIBUTES
# COMPACTNESS                 (average perim)**2 / area
# CIRCULARITY                 (average radius)**2 / area
# DISTANCE CIRCULARITY        area / (av. distance from border)**2
# RADIUS RATIO                (max.rad - min.rad) / av.radius
# PR.AXIS ASPECT RATIO        (minor axis) / (major axis)
# MAX.LENGTH ASPECT RATIO     (length perp. to max length) / (max length)
# SCATTER RATIO               (inertia about minor axis) / (inertia about major axis)
# ELONGATEDNESS               area / (shrink width)**2
# PR.AXIS RECTANGULARITY      area / (pr.axis length * pr.axis width)
# MAX.LENGTH RECTANGULARITY   area / (max.length * length perp. to this)
# SCALED VARIANCE ALONG MAJOR AXIS  (2nd order moment about minor axis) / area
# SCALED VARIANCE ALONG MINOR AXIS  (2nd order moment about major axis) / area
# SCALED RADIUS OF GYRATION   (mavar + mivar) / area
# SKEWNESS ABOUT MAJOR AXIS   (3rd order moment about major axis) / sigma_min**3
# SKEWNESS ABOUT MINOR AXIS   (3rd order moment about minor axis) / sigma_maj**3
# HOLLOWS RATIO               (area of hollows) / (area of bounding polygon)
#
# Where sigma_maj**2 is the variance along the major axis and sigma_min**2 is the
# variance along the minor axis, and
#     area of hollows = area of bounding polygon - area of object
# The area of the bounding polygon is found as a side result of the computation to
# find the maximum length. Each individual length computation yields a pair of
# calipers to the object oriented at every 5 degrees. The object is propagated into
# an image containing the union of these calipers to obtain an image of the
# bounding polygon.
#
# NUMBER OF CLASSES: 3 (VAN, BUS, CAR)
# Summary statistics for every column (include='all' also covers the object-typed 'class').
vehicle_df.describe(include = 'all').transpose()
# BUG FIX: isnull().count() counted ALL cells of the boolean frame (count() on a
# bool column never skips anything), so it always printed the total row count.
# isnull().sum() gives the number of missing values per column, which is what the
# median-imputation step below actually needs to know.
vehicle_df.isnull().sum()
# Copying the dataset into another dataframe so the raw data is kept intact.
vehicle_df1 = vehicle_df.copy(deep = True)
vehicle_df1.head()
# Replacing the NaN values with the median values, as the mean is sensitive to outliers.
columns_missingvalue = ['compactness','circularity','distance_circularity','radius_ratio','pr.axis_aspect_ratio',
'max.length_aspect_ratio','scatter_ratio','elongatedness','pr.axis_rectangularity',
'max.length_rectangularity','scaled_variance','scaled_variance.1','scaled_radius_of_gyration',
'scaled_radius_of_gyration.1','skewness_about','skewness_about.1','skewness_about.2','hollows_ratio'
]
for i in columns_missingvalue:
    # FIX: the original used vehicle_df1[i].replace(np.nan, median, inplace=True),
    # i.e. an in-place mutation of a column selection - that pattern silently stops
    # working under pandas 2.x copy-on-write. Assigning the fillna result back is
    # the supported, equivalent idiom.
    vehicle_df1[i] = vehicle_df1[i].fillna(vehicle_df1[i].median())
vehicle_df1.info()
# Label-encode the target: most estimators require numeric labels, and categorical
# codes map the three class strings to 0/1/2 in their sorted order.
vehicle_df1['class'] = vehicle_df1['class'].astype('category').cat.codes
vehicle_df1.info()
vehicle_df1.head(10)
# Box plots of every feature before outlier treatment.
vehicle_df1.plot(kind = 'box',figsize = (30,30),subplots = True, layout = (5,4))
columns_outlier = ['compactness','circularity','distance_circularity','radius_ratio','pr.axis_aspect_ratio',
'max.length_aspect_ratio','scatter_ratio','elongatedness','pr.axis_rectangularity',
'max.length_rectangularity','scaled_variance','scaled_variance.1','scaled_radius_of_gyration',
'scaled_radius_of_gyration.1','skewness_about','skewness_about.1','skewness_about.2','hollows_ratio'
]
# Tukey-fence outlier treatment: anything beyond Q3 + 1.5*IQR / Q1 - 1.5*IQR
# is replaced by the column median.
for i in columns_outlier:
    upper_q, lower_q = np.percentile(vehicle_df1[i], [75 ,25])
    iqr = upper_q - lower_q
    fence_high = upper_q + 1.5 * iqr
    fence_low = lower_q - 1.5 * iqr
    med = vehicle_df1[i].median()
    print('Number of upper range outliers is : ' , vehicle_df1[vehicle_df1[i] > fence_high][i].shape[0])
    print('Number of Lower range outliers is : ' , vehicle_df1[vehicle_df1[i] < fence_low][i].shape[0])
    vehicle_df1.loc[vehicle_df1[i] > fence_high, i] = med
    vehicle_df1.loc[vehicle_df1[i] < fence_low, i] = med
    print('All outliers are replaced of ', i)
    print('*'*100)
# Viewing the dataframe after eliminating outliers by replacing with median values
vehicle_df1.head(10)
# Checking boxplots for removal of outliers
vehicle_df1.plot(kind = 'box',figsize = (30,30),subplots = True, layout = (5,4))
# For processing of data
from sklearn import preprocessing
# The 18 feature columns, listed in the same order the original code normalised
# them one at a time, so downstream iloc[:, :18] / iloc[:, 18] selections still work.
feature_columns = ['compactness','circularity','distance_circularity','radius_ratio','pr.axis_aspect_ratio',
'max.length_aspect_ratio','scatter_ratio','elongatedness','pr.axis_rectangularity',
'max.length_rectangularity','scaled_variance','scaled_variance.1','scaled_radius_of_gyration',
'scaled_radius_of_gyration.1','skewness_about','skewness_about.1','skewness_about.2','hollows_ratio'
]
# DRY rewrite: the original had 18 copy-pasted x_arrayN / normalize pairs.
# preprocessing.normalize([v]) L2-normalises v as a single row vector, i.e. each
# column is divided by its own Euclidean norm - byte-identical output, one loop.
vehicle_df_norm = pd.DataFrame({
    col: preprocessing.normalize([np.array(vehicle_df1[col])])[0]
    for col in feature_columns
})
# The encoded target is carried over unchanged (it must not be scaled).
vehicle_df_norm['class'] = np.array(vehicle_df1['class'])
vehicle_df_norm.head()
# Distribution of each normalised feature.
vehicle_df_norm.hist(figsize = (20,20))
# Pairwise relationships, coloured by vehicle class, with KDE on the diagonal.
plt.figure(figsize = (20,20))
sns.pairplot(vehicle_df_norm, hue = 'class', diag_kind = 'kde')
# Correlation matrix and its heatmap.
corr = vehicle_df_norm.corr()
corr
plt.figure(figsize = (20,20))
sns.heatmap(corr, linewidth = 1, annot = True)
# Class balance check.
print(vehicle_df_norm['class'].value_counts())
sns.countplot(data = vehicle_df_norm, x = 'class')
# BUG FIX: the original `vehicle_df_norm[:17]` sliced the first 17 ROWS (a slice on
# a DataFrame is row-wise), and iterating that frame yields ALL column names -
# including 'class', so the loop also drew 'class' boxplotted against itself.
# Select the 18 feature column names (everything except the target) instead.
col = vehicle_df_norm.columns[:-1]
for item in col:
    plt.title('Class' + ' against ' + str(item))
    sns.boxplot(x = vehicle_df_norm['class'], y = vehicle_df_norm[item], data = vehicle_df_norm)
    plt.show()
#To split the data into train and test data set
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
#To check the accuracy of the model
from sklearn.metrics import accuracy_score
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
# for prepartation of classification report
from sklearn.metrics import classification_report,make_scorer
# for drawing roc and aoc curves
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score,auc
# for ensemble techniques
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
# Feature matrix: the 18 normalised attribute columns; target: the encoded 'class'
# column (index 18 - the normalisation step preserved the original column order).
X = vehicle_df_norm.iloc[:,:18]
y = vehicle_df_norm.iloc[:,18]
X.head(10)
y.head(10)
# Left over from a multi-class ROC experiment; kept for reference.
#y = label_binarize(y, classes=[0, 1, 2])
#n_classes = y.shape[1]
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import zscore
# Standardise features to zero mean / unit variance so every attribute
# contributes on the same scale to the PCA that follows.
sc = StandardScaler()
X_std = sc.fit_transform(X)
# Obtaining the co-variance matrix (18x18: features become rows after .T).
cov_matrix = np.cov(X_std.T)
# FIX: the original passed the matrix as a SECOND print() argument, so the '%s'
# placeholder was printed literally; use %-formatting, matching the
# "Eigenvalues in descending order" print further below.
print('The Covariance matrix is \n%s' % cov_matrix)
# Step 2 : Calculating eigenvalues and eigenvectors
eigenvalues,eigenvectors = np.linalg.eig(cov_matrix)
print('Eigen values are \n%s' % eigenvalues)
print('Eigen Vectors are \n%s' % eigenvectors)
# Step 3 : Sort the (eigenvalue, eigenvector) pairs in descending eigenvalue order.
eig_pairs = [(eigenvalues[index], eigenvectors[:,index]) for index in range(len(eigenvalues))]
# FIX: the original plain tuple sort falls back to comparing the eigenVECTOR
# arrays whenever two eigenvalues tie, which raises "truth value of an array is
# ambiguous". Sorting on the eigenvalue alone (descending) is both safe and the
# intended order, replacing the sort()+reverse() pair.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
print(eig_pairs)
# Extract the descending ordered eigenvalues and eigenvectors
eigvalues_sorted = [pair[0] for pair in eig_pairs]
eigvectors_sorted = [pair[1] for pair in eig_pairs]
# Let's confirm our sorting worked, print out eigenvalues
print('Eigenvalues in descending order: \n%s' %eigvalues_sorted)
tot = sum(eigenvalues)
# Fraction of total variance explained by each component, in descending order.
# (18 entries - one per eigenvector; the original comment said 9.)
var_explained = [(i / tot) for i in sorted(eigenvalues, reverse=True)]
# Cumulative explained variance; the 18th entry reaches ~100%.
cum_var_exp = np.cumsum(var_explained)
cum_var_exp
# Scree plot: per-component bars plus the cumulative step curve.
plt.bar(range(1,19), var_explained, alpha=0.5, align='center', label='individual explained variance')
plt.step(range(1,19),cum_var_exp, where= 'mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc = 'best')
plt.show()
# Keep the top 10 eigenvectors (rows of P_reduced) as the projection basis.
P_reduced = np.array(eigvectors_sorted[0:10])
P_reduced
# Project the standardised data onto the 10 principal components.
X_std_10D = np.dot(X_std,P_reduced.T)
X_std_10D
# The feature matrix with principal components.
PCA_X = pd.DataFrame(X_std_10D)
PCA_X.head(10)
# Components should be (near) uncorrelated - sanity check.
PCA_X.corr()
seed = 1000
# 70/30 train/test split on the PCA-reduced features; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(PCA_X,y,test_size = 0.3,random_state = seed)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import label_binarize
# NOTE(review): scipy.interp was removed in SciPy 1.8, so this import (unused
# below - leftover from a ROC-curve experiment) breaks on modern SciPy;
# numpy.interp is the drop-in replacement if interpolation is ever needed.
from scipy import interp
from itertools import cycle
# Confusion-matrix helper used after every model evaluation below.
def confusion(x,y):
    """Render the confusion matrix of true labels x vs predictions y as a heatmap."""
    matrix = confusion_matrix(x, y)
    fig, axis = plt.subplots(figsize=(5, 5))
    sns.heatmap(matrix, annot=True, fmt="d", linewidths=.5, ax=axis)
    plt.title("Confusion Matrix", fontsize=20)
    plt.subplots_adjust(left=0.1, right=0.5, bottom=0.11, top=0.5)
    # Centre one y tick on each cell; x tick labels are blanked out.
    axis.set_yticks(np.arange(matrix.shape[0]) + 0.5, minor=False)
    axis.set_xticklabels("")
    plt.show()
# Initiating Models (seeded where the estimator supports random_state, for reproducibility)
clfLR = LogisticRegression(random_state = seed, class_weight = None)
clfNB = GaussianNB()
clfKNN = KNeighborsClassifier()
clfDT = DecisionTreeClassifier(random_state = seed, class_weight = None)
clfSVC = SVC(random_state = seed, class_weight = None)
clfRF = RandomForestClassifier(n_estimators = 18)
clfGB = GradientBoostingClassifier()
# Creating the matrix of (display name, estimator) pairs.
# FIX: 'Gradeint' typo corrected to 'Gradient' so the printed label matches the
# 'Gradient Boosting' entry used in the results dataframes below.
Models = []
Models.append(('Logistic Regression', clfLR))
Models.append(('Naive Bayes', clfNB))
Models.append(('K-Nearest Neighbour',clfKNN))
Models.append(('Decision Tree', clfDT))
Models.append(('Support Vector Classifier', clfSVC))
Models.append(('Random Forest',clfRF))
Models.append(('Gradient Boosting', clfGB))
# Training and cross-validation of each model, with confusion matrix and
# classification report per model.
results = []
names = []
mean_score = []
# NOTE: removed the unused `fpr = dict()` placeholder (the ROC plotting it was
# meant for was never added - see the commented-out AUC column below).
# The splitter is deterministic (fixed seed), so build it once instead of
# re-creating the identical KFold on every loop iteration.
kfold = KFold(n_splits=5, shuffle=True, random_state = seed)
for name,model in Models:
    # 5-fold CV accuracy plus out-of-fold predictions for the confusion matrix.
    cv_results = cross_val_score(model,X_train, y_train,cv = kfold, scoring = 'accuracy')
    cv_predict_train = cross_val_predict(model,X_train,y_train,cv = kfold)
    confusion(y_train,cv_predict_train)
    cr = classification_report(y_train, cv_predict_train)
    print(cr)
    results.append(cv_results)
    names.append(name)
    mean_score.append(cv_results.mean())
    msg = "%s:Mean of cross validation accuracy %f" % (name,cv_results.mean())
    print(msg)
# Summary dataframe: one row per classifier with its mean cross-validation accuracy.
dtrain = {
    'Classifiers': ['Logistic Regession', 'Naive Bayes', 'K Nearest Neighbor', 'Decision Tree', 'Support Vector Classifier',
                    'Random Forest', 'Gradient Boosting'],
    'Cross Val Mean Score': np.array(mean_score),
}
Classifier_train = pd.DataFrame(data = dtrain)
Classifier_train
Models
# Displaying models in descending order of mean cross-validation score (best first).
Classifier_train = Classifier_train.sort_values(by=['Cross Val Mean Score'], ascending=False)
Classifier_train
# Testing the models - rebuild the same (display name, estimator) list used above.
# FIX: 'Gradeint' typo corrected to 'Gradient' to match the labels used in the
# results dataframes.
Models = []
Models.append(('Logistic Regression', clfLR))
Models.append(('Naive Bayes', clfNB))
Models.append(('K-Nearest Neighbour',clfKNN))
Models.append(('Decision Tree', clfDT))
Models.append(('Support Vector Classifier', clfSVC))
Models.append(('Random Forest',clfRF))
Models.append(('Gradient Boosting', clfGB))
# Fitting each model on the TRAINING split and evaluating on the held-out test set,
# with confusion matrix and classification report per model.
names = []
score_test = []
for name,model in Models :
    # BUG FIX: the original called model.fit(X_test, y_test) - it trained every
    # model on the test set and then scored it on the very same rows. That leaks
    # the test labels and inflates every "test" score. Fit on the training split;
    # the test split stays unseen until predict/score.
    model.fit(X_train,y_train)
    y_predict_test = model.predict(X_test)
    confusion(y_test,y_predict_test)
    test_score = model.score(X_test,y_test)
    score_test.append(test_score)
    cr = classification_report(y_test,y_predict_test)
    print(cr)
    msg = "%s : Test Score is %.2f" %(name,test_score)
    print(msg)
    print('*' *100)
# One row per classifier with its held-out test accuracy, displayed best first.
dtest = {
    'Classifiers': ['Logistic Regession', 'Naive Bayes', 'K Nearest Neighbor', 'Decision Tree', 'Support Vector Classifier',
                    'Random Forest', 'Gradient Boosting'],
    'Test Score': np.array(score_test),
}
Classifier_test = pd.DataFrame(data = dtest)
Classifier_test = Classifier_test.sort_values(by=['Test Score'], ascending=False)
Classifier_test
#import gridsearch from sklearn
from sklearn.model_selection import GridSearchCV
# ===== LOGISTIC REGRESSION =====
# Initial logistic-regression classifier before tuning.
clfLR
# Search over the inverse regularisation strength C.
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search_LR = GridSearchCV(estimator=clfLR, param_grid=param_grid, cv=3, scoring=None, verbose=3, n_jobs=-1)
grid_search_LR.fit(X_train, y_train)
grid_search_LR.best_estimator_
grid_search_LR.best_score_
# Evaluate the tuned model on the held-out test set.
y_predict_gs_LR = grid_search_LR.predict(X_test)
confusion(y_test, y_predict_gs_LR)
print(classification_report(y_test, y_predict_gs_LR))
# ===== DECISION TREE CLASSIFIER =====
# Initial Decision Tree Classifier before tuning.
clfDT
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2,4,10,20],
    'min_samples_leaf': [1,5,10,20],
    'max_depth': [2,5,10,20]
}
grid_search = GridSearchCV(estimator = clfDT, param_grid = param_grid, scoring = 'accuracy', cv = 3, n_jobs = -1, verbose = 3)
grid_search.fit(X_train,y_train)
grid_search.best_estimator_
grid_search.best_score_
y_predict_gs = grid_search.predict(X_test)
confusion(y_test,y_predict_gs)
# Consistency fix: the LR / RF / SVC sections all print a classification report
# after the confusion matrix; the decision-tree section was missing it.
print(classification_report(y_test,y_predict_gs))
# ===== RANDOM FOREST =====
# Initial random-forest classifier before tuning.
clfRF
# Grid over split criterion, leaf/split minimums and forest size.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 5, 10],
    'min_samples_split': [2, 4, 10],
    'n_estimators': [50, 100, 400],
}
gs_rf = GridSearchCV(estimator=clfRF, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
gs_rf = gs_rf.fit(X_train, y_train)
gs_rf.best_estimator_
gs_rf.best_score_
# Evaluate the tuned forest on the held-out test set.
y_predict_gs_rf = gs_rf.predict(X_test)
confusion(y_test, y_predict_gs_rf)
print(classification_report(y_test, y_predict_gs_rf))
# ===== SUPPORT VECTOR CLASSIFIER =====
# Initial SVC classifier before tuning.
clfSVC
# Grid over regularisation strength, kernel coefficient and kernel type.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'sigmoid'],
}
gs_svc = GridSearchCV(estimator=clfSVC, param_grid=param_grid, cv=3, scoring='accuracy', verbose=3, n_jobs=-1)
gs_svc.fit(X_train, y_train)
gs_svc.best_estimator_
gs_svc.best_score_
# Evaluate the tuned SVC on the held-out test set.
y_predict_gs_svc = gs_svc.predict(X_test)
confusion(y_test, y_predict_gs_svc)
print(classification_report(y_test, y_predict_gs_svc))
from sklearn.ensemble import BaggingClassifier
# Bag each base model and score it on its out-of-bag samples.
# FIX: 'Gradeint' typo corrected to 'Gradient'; removed the pointless
# `name = []` that was immediately shadowed by the loop variable.
Models = []
Models.append(('Logistic Regression', clfLR))
Models.append(('Naive Bayes', clfNB))
Models.append(('K-Nearest Neighbour',clfKNN))
Models.append(('Decision Tree', clfDT))
Models.append(('Support Vector Classifier', clfSVC))
Models.append(('Random Forest',clfRF))
Models.append(('Gradient Boosting', clfGB))
Bagging_score = []
for name,model in Models:
    # oob_score=True scores each ensemble on the training rows its bootstraps
    # left out - an inexpensive generalisation estimate.
    # NOTE(review): `base_estimator` was renamed `estimator` in sklearn 1.2 and
    # removed in 1.4 - update this keyword when upgrading.
    model_bagging = BaggingClassifier(base_estimator = model,n_estimators=50,max_samples=.7 , oob_score=True,random_state = seed,n_jobs = -1)
    # FIX: fit() returns the fitted estimator itself, not predictions - the
    # original bound it to a misleading `y_predict_bagging` name.
    model_bagging.fit(X_train,y_train)
    score_bagging = model_bagging.oob_score_
    Bagging_score.append(score_bagging)
    print('Bagging score of model %s is %.3f' %(name,score_bagging))
Bagging_score
# Side-by-side comparison of held-out test scores and out-of-bag scores.
d_test_bagging = {
    'Classifiers': ['Logistic Regession', 'Naive Bayes', 'K Nearest Neighbor', 'Decision Tree', 'Support Vector Classifier',
                    'Random Forest', 'Gradient Boosting'],
    'Test Score' : np.array(score_test),
    'Bagging Score' : np.array(Bagging_score)
}
Classifier_test_bagging = pd.DataFrame(data = d_test_bagging)
Classifier_test_bagging
Models
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over the base models that expose predict_proba (KNN/SVC/ensembles excluded).
Models = []
Models.append(('Logistic Regression', clfLR))
Models.append(('Naive Bayes', clfNB))
Models.append(('Decision Tree', clfDT))
Boosting_score = []
for name,model in Models:
    # NOTE(review): `base_estimator` was renamed `estimator` in sklearn 1.2 and
    # removed in 1.4 - update this keyword when upgrading.
    model_boosting = AdaBoostClassifier(base_estimator=model, n_estimators=50, random_state = seed,algorithm = 'SAMME.R')
    model_boosting.fit(X_train,y_train)
    # BUG FIX: the original read `model_bagging.oob_score_` - the leftover
    # estimator from the PREVIOUS bagging cell - so all three "boosting" scores
    # were the same stale bagging number. AdaBoost has no out-of-bag score, so
    # held-out test accuracy is the comparable generalisation estimate.
    score_boosting = model_boosting.score(X_test,y_test)
    Boosting_score.append(score_boosting)
    print('Boosting score of model %s is %.3f' %(name,score_boosting))
d_test_boosting = {
    'Classifiers': ['Logistic Regession', 'Naive Bayes', 'Decision Tree'],
    'Boosting Score' : np.array(Boosting_score)
}
Classifier_test_boosting = pd.DataFrame(data = d_test_boosting)
Classifier_test_boosting
Models